/* -*- c -*- */

#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "Python.h"

#include "npy_config.h"
#include "numpy/npy_common.h"
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
#include "lowlevel_strided_loops.h"

#include "npy_pycompat.h"

#include "ufunc_object.h"

#include <string.h> /* for memchr */

/*
 * cutoff blocksize for pairwise summation
 * decreasing it decreases errors slightly as more pairs are summed but
 * also lowers performance, as the inner loop is unrolled eight times it is
 * effectively 16
 */
#define PW_BLOCKSIZE    128


/*
 * largest simd vector size in bytes numpy supports
 * it is currently a extremely large value as it is only used for memory
 * overlap checks
 */
#ifndef NPY_MAX_SIMD_SIZE
#define NPY_MAX_SIMD_SIZE 1024
#endif

/*
 * include vectorized functions and dispatchers
 * this file is safe to include also for generic builds
 * platform specific instructions are either masked via the proprocessor or
 * runtime detected
 */
#include "simd.inc"

/** Provides the various *_LOOP macros */
#include "fast_loop_macros.h"


/******************************************************************************
 **                          GENERIC FLOAT LOOPS                             **
 *****************************************************************************/

/* direct loops using a suitable callback */

/**begin repeat
 * #c = e, f, d, g#
 * #type = npy_half, npy_float, npy_double, npy_longdouble#
 **/

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_@c@_@c@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef @type@ func_type(@type@);
    func_type *f = (func_type *)func;
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *(@type@ *)op1 = f(in1);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_@c@@c@_@c@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef @type@ func_type(@type@, @type@);
    func_type *f = (func_type *)func;
    BINARY_LOOP {
        @type@ in1 = *(@type@ *)ip1;
        @type@ in2 = *(@type@ *)ip2;
        *(@type@ *)op1 = f(in1, in2);
    }
}

/**end repeat**/

/* indirect loops with casting */
/**begin repeat
 * #c1    = e,         e,          f#
 * #type1 = npy_half,  npy_half,   npy_float#
 * #c2    = f,         d,          d#
 * #type2 = npy_float, npy_double, npy_double#
 *
 * #conv12  = npy_half_to_float, npy_half_to_double, (double)#
 * #conv21  = npy_float_to_half, npy_double_to_half, (float)#
 **/

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_@c1@_@c1@_As_@c2@_@c2@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef @type2@ func_type(@type2@);
    func_type *f = (func_type *)func;
    UNARY_LOOP {
        const @type2@ in1 = @conv12@(*(@type1@ *)ip1);
        *(@type1@ *)op1 = @conv21@(f(in1));
    }
}
/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_@c1@@c1@_@c1@_As_@c2@@c2@_@c2@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef @type2@ func_type(@type2@, @type2@);
    func_type *f = (func_type *)func;
    BINARY_LOOP {
        const @type2@ in1 = @conv12@(*(@type1@ *)ip1);
        const @type2@ in2 = @conv12@(*(@type1@ *)ip2);
        *(@type1@ *)op1 = @conv21@(f(in1, in2));
    }
}

/**end repeat**/

/******************************************************************************
 **                          GENERIC COMPLEX LOOPS                           **
 *****************************************************************************/

/* direct loops using a suitable callback */
/**begin repeat
 * #c = F, D, G#
 * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
 **/

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_@c@_@c@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef void func_type(@type@ *, @type@ *);
    func_type *f = (func_type *)func;
    UNARY_LOOP {
        @type@ in1 = *(@type@ *)ip1;
        @type@ *out = (@type@ *)op1;
        f(&in1, out);
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_@c@@c@_@c@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef void func_type(@type@ *, @type@ *, @type@ *);
    func_type *f = (func_type *)func;
    BINARY_LOOP {
        @type@ in1 = *(@type@ *)ip1;
        @type@ in2 = *(@type@ *)ip2;
        @type@ *out = (@type@ *)op1;
        f(&in1, &in2, out);
    }
}
/**end repeat**/


/* indirect loops with casting */
/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_F_F_As_D_D(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef void func_type(npy_cdouble *, npy_cdouble *);
    func_type *f = (func_type *)func;
    UNARY_LOOP {
        npy_cdouble tmp, out;
        tmp.real = (double)((float *)ip1)[0];
        tmp.imag = (double)((float *)ip1)[1];
        f(&tmp, &out);
        ((float *)op1)[0] = (float)out.real;
        ((float *)op1)[1] = (float)out.imag;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_FF_F_As_DD_D(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    typedef void func_type(npy_cdouble *, npy_cdouble *, npy_cdouble *);
    func_type *f = (func_type *)func;
    BINARY_LOOP {
        npy_cdouble tmp1, tmp2, out;
        tmp1.real = (double)((float *)ip1)[0];
        tmp1.imag = (double)((float *)ip1)[1];
        tmp2.real = (double)((float *)ip2)[0];
        tmp2.imag = (double)((float *)ip2)[1];
        f(&tmp1, &tmp2, &out);
        ((float *)op1)[0] = (float)out.real;
        ((float *)op1)[1] = (float)out.imag;
    }
}


/******************************************************************************
 **                         GENERIC OBJECT lOOPS                             **
 *****************************************************************************/

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_O_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    unaryfunc f = (unaryfunc)func;
    UNARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = f(in1 ? in1 : Py_None);
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_O_O_method(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    char *meth = (char *)func;
    PyObject *tup = PyTuple_New(0);
    if (tup == NULL) {
        return;
    }
    UNARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject **out = (PyObject **)op1;
        PyObject *ret, *func;
        func = PyObject_GetAttrString(in1 ? in1 : Py_None, meth);
        if (func != NULL && !PyCallable_Check(func)) {
            Py_DECREF(func);
            func = NULL;
        }
        if (func == NULL) {
            PyObject *exc, *val, *tb;
            PyTypeObject *type = in1 ? Py_TYPE(in1) : Py_TYPE(Py_None);
            PyErr_Fetch(&exc, &val, &tb);
            PyErr_Format(PyExc_TypeError,
                         "loop of ufunc does not support argument %d of "
                         "type %s which has no callable %s method",
                         i, type->tp_name, meth);
            npy_PyErr_ChainExceptionsCause(exc, val, tb);
            Py_DECREF(tup);
            Py_XDECREF(func);
            return;
        }
        ret = PyObject_Call(func, tup, NULL);
        Py_DECREF(func);
        if (ret == NULL) {
            Py_DECREF(tup);
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
    Py_DECREF(tup);
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_OO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    binaryfunc f = (binaryfunc)func;
    BINARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject *in2 = *(PyObject **)ip2;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = f(in1 ? in1 : Py_None, in2 ? in2 : Py_None);
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

NPY_NO_EXPORT void
PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    ternaryfunc f = (ternaryfunc)func;
    TERNARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject *in2 = *(PyObject **)ip2;
        PyObject *in3 = *(PyObject **)ip3;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = f(
            in1 ? in1 : Py_None,
            in2 ? in2 : Py_None,
            in3 ? in3 : Py_None
        );
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_OO_O_method(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    char *meth = (char *)func;
    BINARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject *in2 = *(PyObject **)ip2;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None,
                                            meth, "(O)", in2);
        if (ret == NULL) {
            return;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
}

/*
 * A general-purpose ufunc that deals with general-purpose Python callable.
 * func is a structure with nin, nout, and a Python callable function
 */

/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
{
    npy_intp n =  dimensions[0];
    PyUFunc_PyFuncData *data = (PyUFunc_PyFuncData *)func;
    int nin = data->nin;
    int nout = data->nout;
    PyObject *tocall = data->callable;
    char *ptrs[NPY_MAXARGS];
    PyObject *arglist, *result;
    PyObject *in, **op;
    npy_intp i, j, ntot;

    ntot = nin+nout;

    for(j = 0; j < ntot; j++) {
        ptrs[j] = args[j];
    }
    for(i = 0; i < n; i++) {
        arglist = PyTuple_New(nin);
        if (arglist == NULL) {
            return;
        }
        for(j = 0; j < nin; j++) {
            in = *((PyObject **)ptrs[j]);
            if (in == NULL) {
                in = Py_None;
            }
            PyTuple_SET_ITEM(arglist, j, in);
            Py_INCREF(in);
        }
        result = PyEval_CallObject(tocall, arglist);
        Py_DECREF(arglist);
        if (result == NULL) {
            return;
        }
        if (nout == 0  && result == Py_None) {
            /* No output expected, no output received, continue */
            Py_DECREF(result);
        }
        else if (nout == 1) {
            /* Single output expected, assign and continue */
            op = (PyObject **)ptrs[nin];
            Py_XDECREF(*op);
            *op = result;
        }
        else if (PyTuple_Check(result) && nout == PyTuple_Size(result)) {
            /*
             * Multiple returns match expected number of outputs, assign
             * and continue. Will also gobble empty tuples if nout == 0.
             */
            for(j = 0; j < nout; j++) {
                op = (PyObject **)ptrs[j+nin];
                Py_XDECREF(*op);
                *op = PyTuple_GET_ITEM(result, j);
                Py_INCREF(*op);
            }
            Py_DECREF(result);
        }
        else {
            /* Mismatch between returns and expected outputs, exit */
            Py_DECREF(result);
            return;
        }
        for(j = 0; j < ntot; j++) {
            ptrs[j] += steps[j];
        }
    }
}

/*
 *****************************************************************************
 **                             BOOLEAN LOOPS                               **
 *****************************************************************************
 */

/**begin repeat
 * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
 * #OP =  ==, !=, >, >=, <, <=#
 **/

NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        npy_bool in1 = *((npy_bool *)ip1) != 0;
        npy_bool in2 = *((npy_bool *)ip2) != 0;
        *((npy_bool *)op1)= in1 @OP@ in2;
    }
}
/**end repeat**/


/**begin repeat
 * #kind = logical_and, logical_or#
 * #OP =  &&, ||#
 * #SC =  ==, !=#
 * #and = 1, 0#
 **/

NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if(IS_BINARY_REDUCE) {
#ifdef NPY_HAVE_SSE2_INTRINSICS
        /*
         * stick with our variant for more reliable performance, only known
         * platform which outperforms it by ~20% is an i7 with glibc 2.17
         */
        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
#else
        /* for now only use libc on 32-bit/non-x86 */
        if (steps[1] == 1) {
            npy_bool * op = (npy_bool *)args[0];
#if @and@
            /* np.all(), search for a zero (false) */
            if (*op) {
                *op = memchr(args[1], 0, dimensions[0]) == NULL;
            }
#else
            /*
             * np.any(), search for a non-zero (true) via comparing against
             * zero blocks, memcmp is faster than memchr on SSE4 machines
             * with glibc >= 2.12 and memchr can only check for equal 1
             */
            static const npy_bool zero[4096]; /* zero by C standard */
            npy_uintp i, n = dimensions[0];

            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
            }
            if (!*op && n - i > 0) {
                *op = memcmp(&args[1][i], zero, n - i) != 0;
            }
#endif
            return;
        }
#endif
        else {
            BINARY_REDUCE_LOOP(npy_bool) {
                const npy_bool in2 = *(npy_bool *)ip2;
                io1 = io1 @OP@ in2;
                if (io1 @SC@ 0) {
                    break;
                }
            }
            *((npy_bool *)iop1) = io1;
        }
    }
    else {
        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
            return;
        }
        else {
            BINARY_LOOP {
                const npy_bool in1 = *(npy_bool *)ip1;
                const npy_bool in2 = *(npy_bool *)ip2;
                *((npy_bool *)op1) = in1 @OP@ in2;
            }
        }
    }
}
/**end repeat**/

/**begin repeat
 * #kind = absolute, logical_not#
 * #OP =  !=, ==#
 **/
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
        return;
    }
    else {
        UNARY_LOOP {
            npy_bool in1 = *(npy_bool *)ip1;
            *((npy_bool *)op1) = in1 @OP@ 0;
        }
    }
}
/**end repeat**/

NPY_NO_EXPORT void
BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((npy_bool *)op1) = 1;
    }
}


/**begin repeat
 * #kind = isnan, isinf, isfinite#
 * #func = npy_isnan, npy_isinf, npy_isfinite#
 * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
 **/
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * The (void)in; suppresses an unused variable warning raised by gcc and allows
     * us to re-use this macro even though we do not depend on in
     */
    UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@);
}

/**end repeat**/

/*
 *****************************************************************************
 **                           INTEGER LOOPS
 *****************************************************************************
 */

/**begin repeat
 * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
 *         LONG, ULONG, LONGLONG, ULONGLONG#
 * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
 *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
 * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
 *          npy_double, npy_double, npy_double, npy_double#
 * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
 * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
 */

#define @TYPE@_floor_divide @TYPE@_divide
#define @TYPE@_fmax @TYPE@_maximum
#define @TYPE@_fmin @TYPE@_minimum

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((@type@ *)op1) = 1;
    }
}

NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = +in);
}

/**begin repeat1
 * #isa = , _avx2#
 * #ISA = , AVX2#
 * #CHK = 1, HAVE_ATTRIBUTE_TARGET_AVX2#
 * #ATTR = , NPY_GCC_TARGET_AVX2#
 */

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = -in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
}
#endif

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
}
#endif

/**begin repeat2
 * Arithmetic
 * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
 * #OP = +, -, *, &, |, ^#
 */

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            io1 @OP@= *(@type@ *)ip2;
        }
        *((@type@ *)iop1) = io1;
    }
    else {
        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
    }
}
#endif

/**end repeat2**/

/*
 * Arithmetic bit shift operations.
 *
 * Intel hardware masks bit shift values, so large shifts wrap around
 * and can produce surprising results. The special handling ensures that
 * behavior is independent of compiler or hardware.
 * TODO: We could implement consistent behavior for negative shifts,
 *       which is undefined in C.
 */

#define INT_left_shift_needs_clear_floatstatus
#define UINT_left_shift_needs_clear_floatstatus

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
                  void *NPY_UNUSED(func))
{
    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));

#ifdef @TYPE@_left_shift_needs_clear_floatstatus
    // For some reason, our macOS CI sets an "invalid" flag here, but only
    // for some types.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#undef INT_left_shift_needs_clear_floatstatus
#undef UINT_left_shift_needs_clear_floatstatus

NPY_NO_EXPORT
#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
NPY_GCC_OPT_3
#endif
void
@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
                   void *NPY_UNUSED(func))
{
    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
}


/**begin repeat2
 * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
 *         logical_and, logical_or#
 * #OP =  ==, !=, >, >=, <, <=, &&, ||#
 */

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * gcc vectorization of this is not good (PR60575) but manual integer
     * vectorization is too tedious to be worthwhile
     */
    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
}
#endif

/**end repeat2**/

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const int t1 = !!*(@type@ *)ip1;
        const int t2 = !!*(@type@ *)ip2;
        *((npy_bool *)op1) = (t1 != t2);
    }
}
#endif

/**end repeat1**/

/**begin repeat1
 * #kind = maximum, minimum#
 * #OP =  >, <#
 **/

NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            const @type@ in2 = *(@type@ *)ip2;
            io1 = (io1 @OP@ in2) ? io1 : in2;
        }
        *((@type@ *)iop1) = io1;
    }
    else {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
        }
    }
}

/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        @type@ in1 = *(@type@ *)ip1;
        @type@ in2 = *(@type@ *)ip2;
        @type@ out;

#if @SIGNED@
        if (in2 < 0) {
            NPY_ALLOW_C_API_DEF
            NPY_ALLOW_C_API;
            PyErr_SetString(PyExc_ValueError,
                    "Integers to negative integer powers are not allowed.");
            NPY_DISABLE_C_API;
            return;
        }
#endif
        if (in2 == 0) {
            *((@type@ *)op1) = 1;
            continue;
        }
        if (in1 == 1) {
            *((@type@ *)op1) = 1;
            continue;
        }

        out = in2 & 1 ? in1 : 1;
        in2 >>= 1;
        while (in2 > 0) {
            in1 *= in1;
            if (in2 & 1) {
                out *= in1;
            }
            in2 >>= 1;
        }
        *((@type@ *) op1) = out;
    }
}

NPY_NO_EXPORT void
@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            *((@type@ *)op1)= in1 % in2;
        }

    }
}

/**begin repeat1
 * #kind = isnan, isinf, isfinite#
 * #func = npy_isnan, npy_isinf, npy_isfinite#
 * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * The (void)in; suppresses an unused variable warning raised by gcc and allows
     * us to re-use this macro even though we do not depend on in
     */
    UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
}
/**end repeat1**/

/**end repeat**/

/**begin repeat
 * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
 * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
 * #c    = ,,,l,ll#
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
}

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        /*
         * FIXME: On x86 at least, dividing the smallest representable integer
         * by -1 causes a SIFGPE (division overflow). We treat this case here
         * (to avoid a SIGFPE crash at python level), but a good solution would
         * be to treat integer division problems separately from FPU exceptions
         * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
         */
        if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
            *((@type@ *)op1) = in1/in2 - 1;
        }
        else {
            *((@type@ *)op1) = in1/in2;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            /* handle mixed case the way Python does */
            const @type@ rem = in1 % in2;
            if ((in1 > 0) == (in2 > 0) || rem == 0) {
                *((@type@ *)op1) = rem;
            }
            else {
                *((@type@ *)op1) = rem + in2;
            }
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        /* see FIXME note for divide above */
        if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
            *((@type@ *)op2) = 0;
        }
        else {
            /* handle mixed case the way Python does */
            const @type@ quo = in1 / in2;
            const @type@ rem = in1 % in2;
            if ((in1 > 0) == (in2 > 0) || rem == 0) {
                *((@type@ *)op1) = quo;
                *((@type@ *)op2) = rem;
            }
            else {
                *((@type@ *)op1) = quo - 1;
                *((@type@ *)op2) = rem + in2;
            }
        }
    }
}

/**begin repeat1
 * #kind = gcd, lcm#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1) = npy_@kind@@c@(in1, in2);
    }
}
/**end repeat1**/

/**end repeat**/

/**begin repeat
 * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
 * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
 * #c    = u,u,u,ul,ull#
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in);
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
}

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            *((@type@ *)op1)= in1/in2;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            *((@type@ *)op1) = in1 % in2;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
            *((@type@ *)op2) = 0;
        }
        else {
            *((@type@ *)op1)= in1/in2;
            *((@type@ *)op2) = in1 % in2;
        }
    }
}

/**begin repeat1
 * #kind = gcd, lcm#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1) = npy_@kind@@c@(in1, in2);
    }
}
/**end repeat1**/

/**end repeat**/

/*
 *****************************************************************************
 **                           DATETIME LOOPS                                **
 *****************************************************************************
 */

NPY_NO_EXPORT void
TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = -in1;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        *((npy_timedelta *)op1) = +in1;
    }
}

NPY_NO_EXPORT void
TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = (in1 >= 0) ? in1 : -in1;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        *((npy_timedelta *)op1) = in1 > 0 ? 1 : (in1 < 0 ? -1 : 0);
    }
}

/**begin repeat
 * #type = npy_datetime, npy_timedelta#
 * #TYPE = DATETIME, TIMEDELTA#
 */

NPY_NO_EXPORT void
@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((npy_bool *)op1) = (in1 == NPY_DATETIME_NAT);
    }
}

NPY_NO_EXPORT void
@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((npy_bool *)op1) = (in1 != NPY_DATETIME_NAT);
    }
}

NPY_NO_EXPORT void
@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE);
}

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((@type@ *)op1) = 1;
    }
}

/**begin repeat1
 * #kind = equal, greater, greater_equal, less, less_equal#
 * #OP =  ==, >, >=, <, <=#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((npy_bool *)op1) = (in1 @OP@ in2 &&
                              in1 != NPY_DATETIME_NAT &&
                              in2 != NPY_DATETIME_NAT);
    }
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((npy_bool *)op1) = (in1 != in2 ||
                              in1 == NPY_DATETIME_NAT ||
                              in2 == NPY_DATETIME_NAT);
    }
}


/**begin repeat1
 * #kind = maximum, minimum#
 * #OP =  >, <#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((@type@ *)op1) = in1;
        }
        else if (in2 == NPY_DATETIME_NAT) {
            *((@type@ *)op1) = in2;
        }
        else {
            *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
        }
    }
}
/**end repeat1**/

/**begin repeat1
 * #kind = fmax, fmin#
 * #OP =  >=, <=#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((@type@ *)op1) = in2;
        }
        else if (in2 == NPY_DATETIME_NAT) {
            *((@type@ *)op1) = in1;
        }
        else {
            *((@type@ *)op1) = in1 @OP@ in2 ? in1 : in2;
        }
    }
}
/**end repeat1**/

/**end repeat**/

NPY_NO_EXPORT void
DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    BINARY_LOOP {
        const npy_datetime in1 = *(npy_datetime *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_datetime *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_datetime *)op1) = in1 + in2;
        }
    }
}

NPY_NO_EXPORT void
DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_datetime in2 = *(npy_datetime *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_datetime *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_datetime *)op1) = in1 + in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 + in2;
        }
    }
}

NPY_NO_EXPORT void
DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_datetime in1 = *(npy_datetime *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_datetime *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_datetime *)op1) = in1 - in2;
        }
    }
}

NPY_NO_EXPORT void
DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_datetime in1 = *(npy_datetime *)ip1;
        const npy_datetime in2 = *(npy_datetime *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 - in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 - in2;
        }
    }
}

/* Note: Assuming 'q' == NPY_LONGLONG */
NPY_NO_EXPORT void
TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_int64 in2 = *(npy_int64 *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 * in2;
        }
    }
}

/* Note: Assuming 'q' == NPY_LONGLONG */
NPY_NO_EXPORT void
TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_int64 in1 = *(npy_int64 *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 * in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const double in2 = *(double *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            double result = in1 * in2;
            if (npy_isfinite(result)) {
                *((npy_timedelta *)op1) = (npy_timedelta)result;
            }
            else {
                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
            }
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const double in1 = *(double *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            double result = in1 * in2;
            if (npy_isfinite(result)) {
                *((npy_timedelta *)op1) = (npy_timedelta)result;
            }
            else {
                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
            }
        }
    }
}

/* Note: Assuming 'q' == NPY_LONGLONG */
NPY_NO_EXPORT void
TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_int64 in2 = *(npy_int64 *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == 0) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            *((npy_timedelta *)op1) = in1 / in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const double in2 = *(double *)ip2;
        if (in1 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            double result = in1 / in2;
            if (npy_isfinite(result)) {
                *((npy_timedelta *)op1) = (npy_timedelta)result;
            }
            else {
                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
            }
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((double *)op1) = NPY_NAN;
        }
        else {
            *((double *)op1) = (double)in1 / (double)in2;
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
        }
        else {
            if (in2 == 0) {
                npy_set_floatstatus_divbyzero();
                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
            }
            else {
                /* handle mixed case the way Python does */
                const npy_timedelta rem = in1 % in2;
                if ((in1 > 0) == (in2 > 0) || rem == 0) {
                    *((npy_timedelta *)op1) = rem;
                }
                else {
                    *((npy_timedelta *)op1) = rem + in2;
                }
            }
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            npy_set_floatstatus_invalid();
            *((npy_int64 *)op1) = 0;
        }
        else if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((npy_int64 *)op1) = 0;
        }
        else {
            if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
                *((npy_int64 *)op1) = in1/in2 - 1;
            }
            else {
                *((npy_int64 *)op1) = in1/in2;
            }
        }
    }
}

NPY_NO_EXPORT void
TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const npy_timedelta in1 = *(npy_timedelta *)ip1;
        const npy_timedelta in2 = *(npy_timedelta *)ip2;
        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
            npy_set_floatstatus_invalid();
            *((npy_int64 *)op1) = 0;
            *((npy_timedelta *)op2) = NPY_DATETIME_NAT;
        }
        else if (in2 == 0) {
            npy_set_floatstatus_divbyzero();
            *((npy_int64 *)op1) = 0;
            *((npy_timedelta *)op2) = NPY_DATETIME_NAT;
        }
        else {
            const npy_int64 quo = in1 / in2;
            const npy_timedelta rem = in1 % in2;
            if ((in1 > 0) == (in2 > 0) || rem == 0) {
                *((npy_int64 *)op1) = quo;
                *((npy_timedelta *)op2) = rem;
            }
            else {
                *((npy_int64 *)op1) = quo - 1;
                *((npy_timedelta *)op2) = rem + in2;
            }
        }
    }
}

/*
 *****************************************************************************
 **                             FLOAT LOOPS                                 **
 *****************************************************************************
 */

/**begin repeat
 * Float types
 *  #type = npy_float, npy_double#
 *  #TYPE = FLOAT, DOUBLE#
 *  #scalarf = npy_sqrtf, npy_sqrt#
 */

NPY_NO_EXPORT void
@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *(@type@ *)op1 = @scalarf@(in1);
        }
    }
}

/**end repeat**/

/**begin repeat
 *  #func = rint, ceil, floor, trunc#
 *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
 */

/**begin repeat1
*  #TYPE = FLOAT, DOUBLE#
*  #type = npy_float, npy_double#
*  #typesub = f, #
*/

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *(@type@ *)op1 = @scalarf@@typesub@(in1);
    }
}


/**end repeat1**/
/**end repeat**/

/**begin repeat
 *  #func = sin, cos, exp, log#
 *  #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
	const npy_float in1 = *(npy_float *)ip1;
	*(npy_float *)op1 = @scalarf@(in1);
    }
}

/**end repeat**/

/**begin repeat
 * #isa = avx512f, fma#
 * #ISA = AVX512F, FMA#
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
 */

/**begin repeat1
 *  #TYPE = FLOAT, DOUBLE#
 *  #type = npy_float, npy_double#
 *  #typesub = f, #
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *(@type@ *)op1 = npy_sqrt@typesub@(in1);
        }
    }
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ tmp = in1 > 0 ? in1 : -in1;
            /* add 0 to clear -0.0 */
            *((@type@ *)op1) = tmp + 0;
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *(@type@ *)op1 = in1*in1;
        }
    }
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *(@type@ *)op1 = 1.0f/in1;
        }
    }
}

/**begin repeat2
 *  #func = rint, ceil, floor, trunc#
 *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *(@type@ *)op1 = @scalarf@@typesub@(in1);
        }
    }
}

/**end repeat2**/
/**end repeat1**/

/**begin repeat1
 *  #func = exp, log#
 *  #scalarf = npy_expf, npy_logf#
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    if (!run_unary_@isa@_@func@_FLOAT(args, dimensions, steps)) {
        UNARY_LOOP {
            /*
             * We use the AVX function to compute exp/log for scalar elements as well.
             * This is needed to ensure the output of strided and non-strided
             * cases match. SIMD code handles strided input cases, but not
             * strided output.
             */
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
            @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);
#else
            const npy_float in1 = *(npy_float *)ip1;
            *(npy_float *)op1 = @scalarf@(in1);
#endif
        }
    }
}

/**end repeat1**/

/**begin repeat1
 *  #func = cos, sin#
 *  #enum = npy_compute_cos, npy_compute_sin#
 *  #scalarf = npy_cosf, npy_sinf#
 */

NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) {
        UNARY_LOOP {
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@);
#else
	        const npy_float in1 = *(npy_float *)ip1;
	        *(npy_float *)op1 = @scalarf@(in1);
#endif
        }
    }
}

/**end repeat1**/
/**end repeat**/


/**begin repeat
 * Float types
 *  #type = npy_float, npy_double, npy_longdouble, npy_float#
 *  #dtype = npy_float, npy_double, npy_longdouble, npy_half#
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF#
 *  #c = f, , l, #
 *  #C = F, , L, #
 *  #trf = , , , npy_half_to_float#
 */

/*
 * Pairwise summation, rounding error O(lg n) instead of O(n).
 * The recursion depth is O(lg n) as well.
 * when updating also update similar complex floats summation
 */
static @type@
pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride)
{
    if (n < 8) {
        npy_intp i;
        @type@ res = 0.;

        for (i = 0; i < n; i++) {
            res += @trf@(*((@dtype@*)(a + i * stride)));
        }
        return res;
    }
    else if (n <= PW_BLOCKSIZE) {
        npy_intp i;
        @type@ r[8], res;

        /*
         * sum a block with 8 accumulators
         * 8 times unroll reduces blocksize to 16 and allows vectorization with
         * avx without changing summation ordering
         */
        r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
        r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
        r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
        r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
        r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
        r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
        r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
        r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));

        for (i = 8; i < n - (n % 8); i += 8) {
            /* small blocksizes seems to mess with hardware prefetch */
            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3);
            r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
            r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
            r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
            r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
            r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
            r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
            r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
            r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
        }

        /* accumulate now to avoid stack spills for single peel loop */
        res = ((r[0] + r[1]) + (r[2] + r[3])) +
              ((r[4] + r[5]) + (r[6] + r[7]));

        /* do non multiple of 8 rest */
        for (; i < n; i++) {
            res += @trf@(*((@dtype@ *)(a + i * stride)));
        }
        return res;
    }
    else {
        /* divide by two but avoid non-multiples of unroll factor */
        npy_intp n2 = n / 2;

        n2 -= n2 % 8;
        return pairwise_sum_@TYPE@(a, n2, stride) +
               pairwise_sum_@TYPE@(a + n2 * stride, n - n2, stride);
    }
}

/**end repeat**/

/**begin repeat
 * Float types
 *  #type = npy_float, npy_double, npy_longdouble#
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 *  #c = f, , l#
 *  #C = F, , L#
 */

/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # PW = 1, 0, 0, 0#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if @PW@
        @type@ * iop1 = (@type@ *)args[0];
        npy_intp n = dimensions[0];

        *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP(@type@) {
            io1 @OP@= *(@type@ *)ip2;
        }
        *((@type@ *)iop1) = io1;
#endif
    }
    else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = in1 @OP@ in2;
        }
    }
}
/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
 *        logical_and, logical_or#
 * #OP = ==, !=, <, <=, >, >=, &&, ||#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((npy_bool *)op1) = in1 @OP@ in2;
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const int t1 = !!*(@type@ *)ip1;
        const int t2 = !!*(@type@ *)ip2;
        *((npy_bool *)op1) = (t1 != t2);
    }
}

NPY_NO_EXPORT void
@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((npy_bool *)op1) = !in1;
    }
}

/**begin repeat1
 * #kind = isnan, isinf, isfinite, signbit#
 * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_@kind@_simd_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((npy_bool *)op1) = @func@(in1) != 0;
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = npy_spacing@c@(in1);
    }
}

NPY_NO_EXPORT void
@TYPE@_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1)= npy_copysign@c@(in1, in2);
    }
}

NPY_NO_EXPORT void
@TYPE@_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1)= npy_nextafter@c@(in1, in2);
    }
}

/**begin repeat1
 * #kind = maximum, minimum#
 * #OP =  >=, <=#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*  */
    if (IS_BINARY_REDUCE) {
        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
            BINARY_REDUCE_LOOP(@type@) {
                const @type@ in2 = *(@type@ *)ip2;
                /* Order of operations important for MSVC 2015 */
                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
            }
            *((@type@ *)iop1) = io1;
        }
    }
    else {
        if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
            BINARY_LOOP {
                @type@ in1 = *(@type@ *)ip1;
                const @type@ in2 = *(@type@ *)ip2;
                /* Order of operations important for MSVC 2015 */
                in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
                *((@type@ *)op1) = in1;
            }
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}

NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*  */
    if (IS_BINARY_REDUCE) {
        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
            BINARY_REDUCE_LOOP(@type@) {
                const @type@ in2 = *(@type@ *)ip2;
                /* Order of operations important for MSVC 2015 */
                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
            }
            *((@type@ *)iop1) = io1;
        }
    }
    else {
        BINARY_LOOP {
            @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            /* Order of operations important for MSVC 2015 */
            in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
            *((@type@ *)op1) = in1;
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

/**begin repeat1
 * #kind = fmax, fmin#
 * #OP =  >=, <=#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*  */
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            const @type@ in2 = *(@type@ *)ip2;
            /* Order of operations important for MSVC 2015 */
            io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
        }
        *((@type@ *)iop1) = io1;
    }
    else {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            /* Order of operations important for MSVC 2015 */
            *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        @type@ mod;
        *((@type@ *)op1) = npy_divmod@c@(in1, in2, &mod);
    }
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        npy_divmod@c@(in1, in2, (@type@ *)op1);
    }
}

NPY_NO_EXPORT void
@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1) = npy_divmod@c@(in1, in2, (@type@ *)op2);
    }
}

NPY_NO_EXPORT void
@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    char * margs[] = {args[0], args[0], args[1]};
    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
    if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((@type@ *)op1) = in1*in1;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    @type@ one = 1.@c@;
    char * margs[] = {(char*)&one, args[0], args[1]};
    npy_intp msteps[] = {0, steps[0], steps[1]};
    if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((@type@ *)op1) = 1/in1;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((@type@ *)op1) = 1;
    }
}

NPY_NO_EXPORT void
@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = in1;
    }
}

NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ tmp = in1 > 0 ? in1 : -in1;
            /* add 0 to clear -0.0 */
            *((@type@ *)op1) = tmp + 0;
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}

NPY_NO_EXPORT void
@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
        UNARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            *((@type@ *)op1) = -in1;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = +in1;
    }
}

NPY_NO_EXPORT void
@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Sign of nan is nan */
    UNARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = in1 > 0 ? 1 : (in1 < 0 ? -1 : (in1 == 0 ? 0 : in1));
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}

NPY_NO_EXPORT void
@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = npy_modf@c@(in1, (@type@ *)op2);
    }
}

NPY_NO_EXPORT void
@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        *((@type@ *)op1) = npy_frexp@c@(in1, (int *)op2);
    }
}

NPY_NO_EXPORT void
@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const int in2 = *(int *)ip2;
        *((@type@ *)op1) = npy_ldexp@c@(in1, in2);
    }
}

NPY_NO_EXPORT void
@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
     * npy_long != npy_int on many 64-bit platforms, so we need this second loop
     * to handle the default integer type.
     */
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const long in2 = *(long *)ip2;
        if (((int)in2) == in2) {
            /* Range OK */
            *((@type@ *)op1) = npy_ldexp@c@(in1, ((int)in2));
        }
        else {
            /*
             * Outside npy_int range -- also ldexp will overflow in this case,
             * given that exponent has less bits than npy_int.
             */
            if (in2 > 0) {
                *((@type@ *)op1) = npy_ldexp@c@(in1, NPY_MAX_INT);
            }
            else {
                *((@type@ *)op1) = npy_ldexp@c@(in1, NPY_MIN_INT);
            }
        }
    }
}

#define @TYPE@_true_divide @TYPE@_divide

/**end repeat**/

/*
 *****************************************************************************
 **                          HALF-FLOAT LOOPS                               **
 *****************************************************************************
 */


/**begin repeat
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # PW = 1, 0, 0, 0#
 */
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        char *iop1 = args[0];
        float io1 = npy_half_to_float(*(npy_half *)iop1);
#if @PW@
        npy_intp n = dimensions[0];

        io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP_INNER {
            io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
        }
#endif
        *((npy_half *)iop1) = npy_float_to_half(io1);
    }
    else {
        BINARY_LOOP {
            const float in1 = npy_half_to_float(*(npy_half *)ip1);
            const float in2 = npy_half_to_float(*(npy_half *)ip2);
            *((npy_half *)op1) = npy_float_to_half(in1 @OP@ in2);
        }
    }
}
/**end repeat**/

#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
#define _HALF_LOGICAL_OR(a,b) (!npy_half_iszero(a) || !npy_half_iszero(b))
/**begin repeat
 * #kind = equal, not_equal, less, less_equal, greater,
 *         greater_equal, logical_and, logical_or#
 * #OP = npy_half_eq, npy_half_ne, npy_half_lt, npy_half_le, npy_half_gt,
 *       npy_half_ge, _HALF_LOGICAL_AND, _HALF_LOGICAL_OR#
 */
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_bool *)op1) = @OP@(in1, in2);
    }
}
/**end repeat**/
#undef _HALF_LOGICAL_AND
#undef _HALF_LOGICAL_OR

NPY_NO_EXPORT void
HALF_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const int in1 = !npy_half_iszero(*(npy_half *)ip1);
        const int in2 = !npy_half_iszero(*(npy_half *)ip2);
        *((npy_bool *)op1) = (in1 != in2);
    }
}

NPY_NO_EXPORT void
HALF_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_bool *)op1) = npy_half_iszero(in1);
    }
}

/**begin repeat
 * #kind = isnan, isinf, isfinite, signbit#
 * #func = npy_half_isnan, npy_half_isinf, npy_half_isfinite, npy_half_signbit#
 **/
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_bool *)op1) = @func@(in1) != 0;
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat**/

NPY_NO_EXPORT void
HALF_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = npy_half_spacing(in1);
    }
}

NPY_NO_EXPORT void
HALF_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1)= npy_half_copysign(in1, in2);
    }
}

NPY_NO_EXPORT void
HALF_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1)= npy_half_nextafter(in1, in2);
    }
}

/**begin repeat
 * #kind = maximum, minimum#
 * #OP =  npy_half_ge, npy_half_le#
 **/
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*  */
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in1)) ? in1 : in2;
    }
    /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
}
/**end repeat**/

/**begin repeat
 * #kind = fmax, fmin#
 * #OP =  npy_half_ge, npy_half_le#
 **/
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*  */
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
    }
    /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
}
/**end repeat**/

NPY_NO_EXPORT void
HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        npy_half mod;
        *((npy_half *)op1) = npy_half_divmod(in1, in2, &mod);
    }
}

NPY_NO_EXPORT void
HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        npy_half_divmod(in1, in2, (npy_half *)op1);
    }
}

NPY_NO_EXPORT void
HALF_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP_TWO_OUT {
        const npy_half in1 = *(npy_half *)ip1;
        const npy_half in2 = *(npy_half *)ip2;
        *((npy_half *)op1) = npy_half_divmod(in1, in2, (npy_half *)op2);
    }
}

NPY_NO_EXPORT void
HALF_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(in1*in1);
    }
}

NPY_NO_EXPORT void
HALF_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(1/in1);
    }
}

NPY_NO_EXPORT void
HALF__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        *((npy_half *)op1) = NPY_HALF_ONE;
    }
}

NPY_NO_EXPORT void
HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = in1;
    }
}

NPY_NO_EXPORT NPY_GCC_OPT_3 void
HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
}

NPY_NO_EXPORT void
HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = in1^0x8000u;
    }
}

NPY_NO_EXPORT void
HALF_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = +in1;
    }
}

NPY_NO_EXPORT void
HALF_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Sign of nan is nan */
    UNARY_LOOP {
        const npy_half in1 = *(npy_half *)ip1;
        *((npy_half *)op1) = npy_half_isnan(in1) ? in1 :
                    (((in1&0x7fffu) == 0) ? 0 :
                      (((in1&0x8000u) == 0) ? NPY_HALF_ONE : NPY_HALF_NEGONE));
    }
}

NPY_NO_EXPORT void
HALF_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    float temp;

    UNARY_LOOP_TWO_OUT {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(npy_modff(in1, &temp));
        *((npy_half *)op2) = npy_float_to_half(temp);
    }
}

NPY_NO_EXPORT void
HALF_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP_TWO_OUT {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        *((npy_half *)op1) = npy_float_to_half(npy_frexpf(in1, (int *)op2));
    }
}

NPY_NO_EXPORT void
HALF_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        const int in2 = *(int *)ip2;
        *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, in2));
    }
}

NPY_NO_EXPORT void
HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
     * npy_long != npy_int on many 64-bit platforms, so we need this second loop
     * to handle the default integer type.
     */
    BINARY_LOOP {
        const float in1 = npy_half_to_float(*(npy_half *)ip1);
        const long in2 = *(long *)ip2;
        if (((int)in2) == in2) {
            /* Range OK */
            *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, ((int)in2)));
        }
        else {
            /*
             * Outside npy_int range -- also ldexp will overflow in this case,
             * given that exponent has less bits than npy_int.
             */
            if (in2 > 0) {
                *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MAX_INT));
            }
            else {
                *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MIN_INT));
            }
        }
    }
}

#define HALF_true_divide HALF_divide


/*
 *****************************************************************************
 **                           COMPLEX LOOPS                                 **
 *****************************************************************************
 */

#define CGE(xr,xi,yr,yi) ((xr > yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi >= yi))
#define CLE(xr,xi,yr,yi) ((xr < yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi <= yi))
#define CGT(xr,xi,yr,yi) ((xr > yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi > yi))
#define CLT(xr,xi,yr,yi) ((xr < yr && !npy_isnan(xi) && !npy_isnan(yi)) \
                          || (xr == yr && xi < yi))
#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi)
#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi)

/**begin repeat
 * complex types
 * #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE#
 * #ftype = npy_float, npy_double, npy_longdouble#
 * #c = f, , l#
 * #C = F, , L#
 * #SIMD = 1, 1, 0#
 */

/* similar to pairwise sum of real floats */
static void
pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
                    npy_intp stride)
{
    assert(n % 2 == 0);
    if (n < 8) {
        npy_intp i;

        *rr = 0.;
        *ri = 0.;
        for (i = 0; i < n; i += 2) {
            *rr += *((@ftype@ *)(a + i * stride + 0));
            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
        }
        return;
    }
    else if (n <= PW_BLOCKSIZE) {
        npy_intp i;
        @ftype@ r[8];

        /*
         * sum a block with 8 accumulators
         * 8 times unroll reduces blocksize to 16 and allows vectorization with
         * avx without changing summation ordering
         */
        r[0] = *((@ftype@ *)(a + 0 * stride));
        r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
        r[2] = *((@ftype@ *)(a + 2 * stride));
        r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
        r[4] = *((@ftype@ *)(a + 4 * stride));
        r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
        r[6] = *((@ftype@ *)(a + 6 * stride));
        r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));

        for (i = 8; i < n - (n % 8); i += 8) {
            /* small blocksizes seems to mess with hardware prefetch */
            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3);
            r[0] += *((@ftype@ *)(a + (i + 0) * stride));
            r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
            r[2] += *((@ftype@ *)(a + (i + 2) * stride));
            r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
            r[4] += *((@ftype@ *)(a + (i + 4) * stride));
            r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
            r[6] += *((@ftype@ *)(a + (i + 6) * stride));
            r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
        }

        /* accumulate now to avoid stack spills for single peel loop */
        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
        *ri = ((r[1] + r[3]) + (r[5] + r[7]));

        /* do non multiple of 8 rest */
        for (; i < n; i+=2) {
            *rr += *((@ftype@ *)(a + i * stride + 0));
            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
        }
        return;
    }
    else {
        /* divide by two but avoid non-multiples of unroll factor */
        @ftype@ rr1, ri1, rr2, ri2;
        npy_intp n2 = n / 2;

        n2 -= n2 % 8;
        pairwise_sum_@TYPE@(&rr1, &ri1, a, n2, stride);
        pairwise_sum_@TYPE@(&rr2, &ri2, a + n2 * stride, n - n2, stride);
        *rr = rr1 + rr2;
        *ri = ri1 + ri2;
        return;
    }
}


/**begin repeat1
 * arithmetic
 * #kind = add, subtract#
 * #OP = +, -#
 * #PW = 1, 0#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE && @PW@) {
        npy_intp n = dimensions[0];
        @ftype@ * or = ((@ftype@ *)args[0]);
        @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
        @ftype@ rr, ri;

        pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
        *or @OP@= rr;
        *oi @OP@= ri;
        return;
    }
    else {
        BINARY_LOOP {
            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
            ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
            ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
        }
    }
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
        ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
    }
}

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        const @ftype@ in2r_abs = npy_fabs@c@(in2r);
        const @ftype@ in2i_abs = npy_fabs@c@(in2i);
        if (in2r_abs >= in2i_abs) {
            if (in2r_abs == 0 && in2i_abs == 0) {
                /* divide by zero should yield a complex inf or nan */
                ((@ftype@ *)op1)[0] = in1r/in2r_abs;
                ((@ftype@ *)op1)[1] = in1i/in2i_abs;
            }
            else {
                const @ftype@ rat = in2i/in2r;
                const @ftype@ scl = 1.0@c@/(in2r + in2i*rat);
                ((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl;
                ((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl;
            }
        }
        else {
            const @ftype@ rat = in2r/in2i;
            const @ftype@ scl = 1.0@c@/(in2i + in2r*rat);
            ((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl;
            ((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl;
        }
    }
}

#if @SIMD@
NPY_NO_EXPORT void
@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
{
    if (IS_BINARY_REDUCE) {
        @TYPE@_add(args, dimensions, steps, func);
    }
    else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) {
        @TYPE@_add(args, dimensions, steps, func);
    }
}

/**begin repeat1
 * arithmetic
 * #kind = subtract, multiply#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
{
    if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
        @TYPE@_@kind@(args, dimensions, steps, func);
    }
}
/**end repeat1**/
#endif

NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        if (npy_fabs@c@(in2r) >= npy_fabs@c@(in2i)) {
            const @ftype@ rat = in2i/in2r;
            ((@ftype@ *)op1)[0] = npy_floor@c@((in1r + in1i*rat)/(in2r + in2i*rat));
            ((@ftype@ *)op1)[1] = 0;
        }
        else {
            const @ftype@ rat = in2r/in2i;
            ((@ftype@ *)op1)[0] = npy_floor@c@((in1r*rat + in1i)/(in2i + in2r*rat));
            ((@ftype@ *)op1)[1] = 0;
        }
    }
}

/**begin repeat1
 * #kind= greater, greater_equal, less, less_equal, equal, not_equal#
 * #OP = CGT, CGE, CLT, CLE, CEQ, CNE#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        *((npy_bool *)op1) = @OP@(in1r,in1i,in2r,in2i);
    }
}
/**end repeat1**/

/**begin repeat1
   #kind = logical_and, logical_or#
   #OP1 = ||, ||#
   #OP2 = &&, ||#
*/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        *((npy_bool *)op1) = (in1r @OP1@ in1i) @OP2@ (in2r @OP1@ in2i);
    }
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        const npy_bool tmp1 = (in1r || in1i);
        const npy_bool tmp2 = (in2r || in2i);
        *((npy_bool *)op1) = tmp1 != tmp2;
    }
}

NPY_NO_EXPORT void
@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((npy_bool *)op1) = !(in1r || in1i);
    }
}

/**begin repeat1
 * #kind = isnan, isinf, isfinite#
 * #func = npy_isnan, npy_isinf, npy_isfinite#
 * #OP = ||, ||, &&#
 **/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((npy_bool *)op1) = @func@(in1r) @OP@ @func@(in1i);
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        ((@ftype@ *)op1)[0] = in1r*in1r - in1i*in1i;
        ((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
    }
}

NPY_NO_EXPORT void
@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        if (npy_fabs@c@(in1i) <= npy_fabs@c@(in1r)) {
            const @ftype@ r = in1i/in1r;
            const @ftype@ d = in1r + in1i*r;
            ((@ftype@ *)op1)[0] = 1/d;
            ((@ftype@ *)op1)[1] = -r/d;
        } else {
            const @ftype@ r = in1r/in1i;
            const @ftype@ d = in1r*r + in1i;
            ((@ftype@ *)op1)[0] = r/d;
            ((@ftype@ *)op1)[1] = -1/d;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    OUTPUT_LOOP {
        ((@ftype@ *)op1)[0] = 1;
        ((@ftype@ *)op1)[1] = 0;
    }
}

NPY_NO_EXPORT void
@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        ((@ftype@ *)op1)[0] = in1r;
        ((@ftype@ *)op1)[1] = -in1i;
    }
}

NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
    }
}

#if @SIMD@
/**begin repeat1
 * arithmetic
 * #kind = conjugate, square, absolute#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
{
    if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
        @TYPE@_@kind@(args, dimensions, steps, func);
    }
}
/**end repeat1**/
#endif

NPY_NO_EXPORT void
@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        *((@ftype@ *)op1) = npy_atan2@c@(in1i, in1r);
    }
}

NPY_NO_EXPORT void
@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* fixme: sign of nan is currently 0 */
    UNARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        ((@ftype@ *)op1)[0] = CGT(in1r, in1i, 0.0, 0.0) ?  1 :
                            (CLT(in1r, in1i, 0.0, 0.0) ? -1 :
                            (CEQ(in1r, in1i, 0.0, 0.0) ?  0 : NPY_NAN@C@));
        ((@ftype@ *)op1)[1] = 0;
    }
}

/**begin repeat1
 * #kind = maximum, minimum#
 * #OP = CGE, CLE#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        @ftype@ in1r = ((@ftype@ *)ip1)[0];
        @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        if ( !(npy_isnan(in1r) || npy_isnan(in1i) || @OP@(in1r, in1i, in2r, in2i))) {
            in1r = in2r;
            in1i = in2i;
        }
        ((@ftype@ *)op1)[0] = in1r;
        ((@ftype@ *)op1)[1] = in1i;
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

/**begin repeat1
 * #kind = fmax, fmin#
 * #OP = CGE, CLE#
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
        if (npy_isnan(in2r) || npy_isnan(in2i) || @OP@(in1r, in1i, in2r, in2i)) {
            ((@ftype@ *)op1)[0] = in1r;
            ((@ftype@ *)op1)[1] = in1i;
        }
        else {
            ((@ftype@ *)op1)[0] = in2r;
            ((@ftype@ *)op1)[1] = in2i;
        }
    }
    npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

#define @TYPE@_true_divide @TYPE@_divide

/**end repeat**/

#undef CGE
#undef CLE
#undef CGT
#undef CLT
#undef CEQ
#undef CNE

/*
 *****************************************************************************
 **                            OBJECT LOOPS                                 **
 *****************************************************************************
 */

/**begin repeat
 * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
 * #OP = EQ, NE, GT, GE, LT, LE#
 * #identity = NPY_TRUE, NPY_FALSE, -1*4#
 */

/**begin repeat1
 * #suffix = , _OO_O#
 * #as_bool = 1, 0#
 */
NPY_NO_EXPORT void
OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
    BINARY_LOOP {
        PyObject *ret_obj;
        PyObject *in1 = *(PyObject **)ip1;
        PyObject *in2 = *(PyObject **)ip2;

        in1 = in1 ? in1 : Py_None;
        in2 = in2 ? in2 : Py_None;

        /*
         * Do not use RichCompareBool because it includes an identity check for
         * == and !=. This is wrong for elementwise behaviour, since it means
         * that NaN can be equal to NaN and an array is equal to itself.
         */
        ret_obj = PyObject_RichCompare(in1, in2, Py_@OP@);
        if (ret_obj == NULL) {
            return;
        }
#if @as_bool@
        {
            int ret = PyObject_IsTrue(ret_obj);
            Py_DECREF(ret_obj);
            if (ret == -1) {
                return;
            }
            *((npy_bool *)op1) = (npy_bool)ret;
        }
#else
        *((PyObject **)op1) = ret_obj;
#endif
    }
}
/**end repeat1**/
/**end repeat**/

NPY_NO_EXPORT void
OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    PyObject *zero = PyLong_FromLong(0);

    UNARY_LOOP {
        PyObject *in1 = *(PyObject **)ip1;
        PyObject **out = (PyObject **)op1;
        PyObject *ret = NULL;
        int v;

        if (in1 == NULL) {
            in1 = Py_None;
        }

        if ((v = PyObject_RichCompareBool(in1, zero, Py_LT)) == 1) {
            ret = PyLong_FromLong(-1);
        }
        else if (v == 0 &&
                (v = PyObject_RichCompareBool(in1, zero, Py_GT)) == 1) {
            ret = PyLong_FromLong(1);
        }
        else if (v == 0 &&
                (v = PyObject_RichCompareBool(in1, zero, Py_EQ)) == 1) {
            ret = PyLong_FromLong(0);
        }
        else if (v == 0) {
            /* in1 is NaN */
            PyErr_SetString(PyExc_TypeError,
                    "unorderable types for comparison");
        }

        if (ret == NULL) {
            break;
        }
        Py_XDECREF(*out);
        *out = ret;
    }
    Py_XDECREF(zero);
}

/*
 *****************************************************************************
 **                              END LOOPS                                  **
 *****************************************************************************
 */
